/**********************************************************************/
/*
   Author: Robbie Dulin, Michelle Han
   Created: 5 May 2021
   Description: Cleans raw August 2019 SAKERNAS data.

   Last update: Apr 2024
   This cleaning file should output 2 different datasets:
   1. Cleaned SAKERNAS data:
   sak_aug19_deid_clean
   2. Subset of SAKERNAS person-batch data matched with PMO for SAKERNAS analysis:
  sak_aug19_deid_clean_merged

*/
/**********************************************************************/
  * Pull in the project filepath globals unless a master do-file already did
  if "$master_run" != "1" include "./Do/SET_FILEPATHS.do"

  * Close any log left open, then start a date-stamped text log for this run
  capture log close
  global prefix: display %tdCYND td(`c(current_date)')
  log using "$KP_logs/${prefix}_clean_SAKERNAS_aug2019.txt", text replace

  clear
  set more off

* Read the raw de-identified SAKERNAS extract and report its size
  use "$KP_deid_sakernas/Raw/SAKERNAS_PRAKERJA_19AUG_deid.dta", clear
  display _N

* Discard records with no survey year (tahun) and report the size again
  count if tahun == . // NOTE from Michelle: why?
  drop if tahun == .
  display _N

/*----------------------------------------------------*/
              /* Section: Demographics */
/*----------------------------------------------------*/

* NOTE: this survey carries no household id

* Household size
  generate hh_size = b2_r1

* Birth year
  generate year_dob = b4_k7_th

* Age in years, plus a banded version (age_cat) in 20-year groups;
* recode leaves values outside 0-100 unchanged
  summarize b4_k8
  generate age = b4_k8
  recode age (0/20 = 1) (21/40 = 2) (41/60 = 3) (61/80 = 4) (81/100 = 5), generate(age_cat)

* Indicator for respondents aged 30 or younger (a missing age evaluates to 0)
  generate young = (age <= 30)
  label define young 0 "Over 30 Years Old" 1 "30 and Under"
  label values young young

* Relationship to the household head, labelled copy of b4_k3
  tabulate b4_k3, missing
  generate relation_hh_head = b4_k3
  label define relations 1 "HH head" 2 "Spouse" 3 "Son/Daughter" 4 "Step/adopted child" 5 "Son/Daughter-in-law" 6 "Grandchild" 7 "Parent/Parent-in-law" 8 "Other family" 9 "Housemaid" 10 "Driver/Gardener" 11 "No relation", replace
  label values relation_hh_head relations
  tabulate relation_hh_head

* Marital status indicators built from b4_k10
* (1 -> single, 2 -> married, 3 -> divorced, 4 -> widowed; anything else gives 0)
  tabulate b4_k10, missing
  generate married  = (b4_k10 == 2)
  generate divorced = (b4_k10 == 3)
  generate widowed  = (b4_k10 == 4)
  generate single   = (b4_k10 == 1)

* Gender: female flags b4_k6 == 2, sex flags b4_k6 == 1 (labelled male)
  tabulate b4_k6, missing
  generate female = (b4_k6 == 2)

  generate sex = (b4_k6 == 1)
  label define sex 1 "male"  0 "female"
  label values sex sex

* Currently enrolled as a student (b4_k9 == 2)
  tabulate b4_k9, missing
  generate current_student = (b4_k9 == 2)

* education levels
* Indicators for the highest level recorded in b5_r1a (codes 1-16); a missing
* or out-of-range code yields 0 on every indicator.
  tab b5_r1a, m
  gen no_elementary = b5_r1a == 1
  gen elementary = 2 <= b5_r1a & b5_r1a <= 4
  gen junior_high = 5 <= b5_r1a & b5_r1a <= 7
  gen high_school = 8 <= b5_r1a & b5_r1a <= 11
  gen tertiary = 12 <= b5_r1a & b5_r1a <= 16
  summ no_elementary elementary junior_high high_school tertiary

* DH: tried to uniformize with au20
* Approximate years of schooling implied by each education code; codes outside
* 1-16 (and missing) leave school_years missing.
  gen educ = b5_r1a
  gen school_years = 3 if educ == 1 // No elementary
  replace school_years = 6 if inlist(educ, 2, 3, 4) // elementary: SD / MI / SDLB / Package A
  replace school_years = 9 if inlist(educ, 5, 6, 7) // junior high : SMP/MTs/SMPLB/Package B
  replace school_years = 12 if inlist(educ, 8, 9, 10, 11) // high (8-11): SMA / MA / SMLB / Paket C AND VOCATIONAL SCHOOL/MAK
  replace school_years = 14 if inlist(educ, 12, 13) // Diploma I/II/III
  replace school_years = 16 if inlist(educ, 14) // Diploma IV and S1
  replace school_years = 18 if inlist(educ, 15, 16) // S2/S3

* Dummy for educated (graduating high school)
* FIX: the previous test was "school_years > 12", which (a) excluded high-school
* graduates (exactly 12 years) even though the value label below says
* "12 or More", and (b) coded respondents with missing school_years as 1,
* because Stata treats missing as larger than any number. The dummy is now
* ">= 12" and is left missing when school_years is missing.
* NOTE: this changes values of `educated`, so the hard-coded datasignature
* strings checked before each save in this file must be regenerated.
  gen educated = school_years >= 12 if !mi(school_years)
  la def educated 0 "Less than 12 Years Schooling" 1 "12 or More Years of Schooling"
  la val educated educated

* Training: holds a training certificate (b5_r1d == 1)
  tabulate b5_r1d, missing
  generate train_certif = (b5_r1d == 1)

* Training: currently attending a course (b5_r1f == 1)
  tabulate b5_r1f, missing
  generate current_training = (b5_r1f == 1)

* City indicator (disabled)
*  gen city_dummy = kode_kab >= 70 & !mi(kode_kab)

* Kabupaten of birth
  generate born_kab = b5_r2b
*  gen born_live_same = born_kab == kode_kab

* Migration: inspect the province (b5_r3a) and kabupaten (b5_r3b) answers;
* the derived flag itself is disabled
  tabulate b5_r3a, missing
  tabulate b5_r3b, missing
*  gen migrated = (kode_prov !=  b5_r3a) | (kode_kab != b5_r3b)
*  tab migrated, m

* disabilities
  // Items b5_r4a-b5_r4f are handled identically except for their code set:
  // a/c/e are flagged as disabled on codes 2-3, b/d/f on codes 5-6.
  // For each item: tabulate it, build the 0/1 flag, keep a raw copy.
  local domains vision hearing walk hand speech other
  local items   a b c d e f
  local mild    2 5 2 5 2 5
  local severe  3 6 3 6 3 6

  forvalues i = 1/6 {
    local dom : word `i' of `domains'
    local q   : word `i' of `items'
    local lo  : word `i' of `mild'
    local hi  : word `i' of `severe'

    tabulate b5_r4`q', missing
    generate `dom'_disabled = inlist(b5_r4`q', `lo', `hi')
    generate `dom'_disability = b5_r4`q'
  }

  summarize *disabled

  // At least one of the six flags is set
  generate any_disability = vision_disabled | hearing_disabled | walk_disabled | hand_disabled | speech_disabled | other_disabled
  tabulate any_disability, missing

  // At least one item reported at its highest code (3 or 6)
  generate severe_disability = (b5_r4a == 3) | (b5_r4b == 6) | (b5_r4c == 3) | (b5_r4d == 6) | (b5_r4e == 3) | (b5_r4f == 6)
  tabulate severe_disability

* Urban indicator (klasifikas == 1)
  tabulate klasifikas, missing
  generate urban = (klasifikas == 1)


/*----------------------------------------------------*/
                /* Section: Employment */
/*----------------------------------------------------*/

* employed (worked at least 1 hour in past week)
  // Review the four screening questions, then flag anyone answering 1 to any:
  //   b5_r5a1  worked an uninterrupted hour last week
  //   b5_r6    usually works an uninterrupted hour, but temporarily not at work
  //   b5_r7a   worked a cumulative (not uninterrupted) hour
  //   b5_r7b   usually works a cumulative hour, but temporarily not at work
  tabulate b5_r5a1, missing
  tabulate b5_r6, missing
  tabulate b5_r7a, missing
  tabulate b5_r7b, missing
  generate employed = (b5_r5a1 == 1) | (b5_r6 == 1) | (b5_r7a == 1) | (b5_r7b == 1)
  tabulate employed, missing

* Employment status: labelled copy of b5_r24a
  // Includes people who said no to working for income but reported other work;
  // the employment block is asked of everyone the enumerator deems to have worked.
  tabulate b5_r24a, missing
  generate employment_status = b5_r24a
  label define status 0 "Not Working" 1 "Self-employed" 2 "Business owner with temporary/unpaid workers" 3 "Business owner with paid workers" 4 "Employee" 5 "Temporary worker in agriculture" 6 "Temporary worker (non-agriculture)" 7 "Family/unpaid worker", replace
  label values employment_status status
  tabulate employment_status, missing

  * Type-of-work indicators derived from employment_status (12A)
  generate self_employed      = (employment_status == 1)
  generate business_owner     = inlist(employment_status, 2, 3)
  generate self_emp_bus_owner = inlist(employment_status, 1, 2, 3)
  generate perm_worker        = (employment_status == 4)
  generate temp_worker        = inlist(employment_status, 5, 6)
  generate family_unpaid      = (employment_status == 7)

* Final definition of employed: overwrites the screening-question version above
* with "employment_status is 1-6" (family/unpaid workers, code 7, count as 0)
  replace employed = inlist(employment_status, 1, 2, 3, 4, 5, 6)
  tabulate employed, missing
  tabulate employed employment_status, missing row

  * Job-search method indicators
  * Coding note: 1 or 3 = yes, 2 or 4 = no, 0 = not asked
  generate reg_job_mkt     = (b5_r16a == 1)
  generate contact_company = (b5_r16b == 3)
  generate advert_online   = (b5_r16d == 3)
  generate search_network  = (b5_r16e == 1)
  generate cap_loc_lic     = (b5_r16f == 3) | (b5_r16g == 1) | (b5_r16h == 3)
  generate any_attempt     = (b5_r16i == 1)

* Business field
  // 17-sector classification: (https://www.bps.go.id/statictable/2016/01/06/1898/klasifikasi-17-sektor-tabel-input-output-indonesia-2010.html)
  // business_field is only filled in for respondents with employed == 1.
  // FIX: value label 10 was misspelled "Infomation"; corrected so the label
  // saved with the cleaned dataset reads "Information and Communication".
  // Only label text changes, not the underlying values.
  tab b5_r20_kat, m
  la def business_fields 1 "Agriculture, Forestry and Fisheries" 2 "Mining and excavation" 3 "Industrial Production" 4 "Electricity and Gas" ///
    5 "Water Supply, Waste Management, and Recycling" 6 "Construction" 7 "Wholesale and Retail Trade, Car and Motorcycle Repair" ///
    8 "Transportation and Warehousing" 9 "Hospitality and Restaurants" 10 "Information and Communication" 11 "Financial Services and Insurance" ///
    12 "Real Estate" 13 "Business Services" 14 "Government Administration, Defense and Social Security" 15 "Education" 16 "Health and Social Services" 17 "Other", replace
  gen business_field = b5_r20_kat if employed == 1
  la val business_field business_fields
  tab business_field, m

* KBJI code (https://www.bps.go.id/website/fileMenu/KBJI-2014.pdf)
  // this also doesn't seem to match the codes (there are some codes in the data that don't exist in the classification)
  // occupation copies b5_r21_kbj and is only filled in for employed == 1.
  tab b5_r21_kji, m
  tab b5_r21_kbj, m
  la def occupations 0 "Army and Police" 1 "Manager" 2 "Professional" 3 "Technicians and Assistant Professionals" 4 "Administrative Personnel" 5 "Business Services and Sales Personnel" 6 "Skilled Workers in Agriculture, Forestry and Fisheries" 7 "Production, Craft, and Related Workers" 8 "Machine Operators and Assembly Workers" 9 "Blue-collar workers", replace
  gen occupation = b5_r21_kbj if employed == 1
  la val occupation occupations
  tab occupation, m

* wage last month (original note: "assuming this is January")
* NOTE(review): this file cleans the August 2019 round, so the reference month
* is presumably not January -- confirm which month the /31 used later refers to.
* First inspect the b5_r28b* items inside and outside employment statuses 1, 5, 6.
  summ b5_r28b1 b5_r28b2 if inlist(employment_status, 1, 5, 6)
  summ b5_r28b1 b5_r28b2 if !inlist(employment_status, 1, 5, 6)
  tab b5_r28b1 if !inlist(employment_status, 1, 5, 6)

* Same inspection for the b5_r28c* items, inside and outside status 4 (employee).
  summ b5_r28c1 b5_r28c2 if inlist(employment_status, 4)
  summ b5_r28c1 b5_r28c2 if !inlist(employment_status, 4)
  count if b5_r28c1 != 0 & !inlist(employment_status, 4)
  tab b5_r28c1 if  !inlist(employment_status, 4)

  // NOTE: there are a few respondents to these questions who should not have been asked it according to the survey flow
  // only 5 erroneous responses: seems like info was just entered under wrong question
  // one observation appears to duplicate the income, will delete the duplicate
  // Where both b5_r28b1 and b5_r28c1 are non-zero, b5_r28b1 is zeroed so the
  // amount is not counted twice in the sum below. Note that "!= 0" is also
  // true for missing values.
  count if b5_r28b1 != 0 & b5_r28c1 != 0
  replace b5_r28b1 = 0 if b5_r28b1 != 0 & b5_r28c1 != 0

* Total earnings = sum of the four components, only for employment statuses
* 1, 4, 5, 6; the sum is missing whenever any single component is missing.
  gen earnings = b5_r28b1 + b5_r28b2 + b5_r28c1 + b5_r28c2 if inlist(employment_status, 1, 4, 5, 6)
  summ earnings if inlist(employment_status, 1, 4, 5, 6), d
  count if mi(earnings)

* Individual earnings, including zero if not employed
  gen ind_earnings = earnings
  replace ind_earnings = 0 if employed == 0
  sum ind_earnings, d

* Winsorized copy: cutoffs are the 1st/99th percentiles of strictly positive
* earnings. The two replace lines read r(p99)/r(p1) left behind by the
* summarize directly above, so nothing that overwrites r() may be inserted
* between them. Values above p99 are set to p99; non-zero values below p1 are
* set to p1; zeros and missings are left untouched.
  gen ind_adj_earnings = ind_earnings
  sum ind_adj_earnings if ind_adj_earnings > 0, d
  replace ind_adj_earnings = r(p99) if ind_adj_earnings > r(p99) & !mi(ind_adj_earnings)
  replace ind_adj_earnings = r(p1) if ind_adj_earnings < r(p1) & !mi(ind_adj_earnings) & ind_adj_earnings != 0
  sum ind_adj_earnings, d
* Winsorized earnings kept only where strictly positive (missing otherwise)
  gen ind_pos_adj_earnings = ind_adj_earnings if ind_adj_earnings > 0

* 0/1 flag for strictly positive earnings; missing when ind_earnings is missing
  gen ind_earnings_pos = ind_earnings > 0 if !mi(ind_earnings)

* Weekly hours worked (b5_r23a)
  // Inspect the b5_r23a? items and the total among statuses 1, 4, 5, 6
  summarize b5_r23a? if inlist(employment_status, 1, 4, 5, 6)
  summarize b5_r23a if inlist(employment_status, 1, 4, 5, 6)
  // Largest weekly total expressed as hours per day
  display `r(max)'/7
  // max is 14 hours per day
  count if b5_r23a > (7 * 14)
  histogram b5_r23a

  // Missing hours must only occur for the non-employed; those become zero hours
  generate hours_worked = b5_r23a
  assert employed == 0 if missing(hours_worked)
  replace hours_worked = 0 if missing(hours_worked)
  summarize hours_worked if hours_worked != 0

* hourly wage (income per day / hours per day)
* Monthly earnings are divided by 31 days and weekly hours by 7 days.
* raw wages and hours (no winsorizing; zero hours give a missing value)
  gen hourly_wage_raw = (earnings / 31) / (hours_worked / 7)
  summ hourly_wage_raw, d

* Hourly wage, including business owners
* Built from ind_adj_earnings, i.e. earnings already winsorized at the 1st/99th
* percentiles; set to 0 (not missing) when no hours were worked.
* NOTE(review): earnings is only constructed for employment_status 1, 4, 5, 6,
* so business owners (statuses 2-3) presumably end up with a missing wage
* here -- confirm that matches the "including business owners" intent.
  gen hourly_wage = (ind_adj_earnings/31)/(hours_worked / 7) if hours_worked != 0
  replace hourly_wage = 0 if hours_worked == 0
  gen poswage = hourly_wage if hourly_wage > 0

* Winsorize at the 1st and 99th percentiles of strictly positive wages
* (values are capped at the cutoffs, not dropped). The replace lines read
* r(p99)/r(p1) from the summarize immediately above them; zeros are untouched.
  gen hourly_adj_wage = hourly_wage
  sum hourly_adj_wage if hourly_adj_wage > 0, d
  replace hourly_adj_wage = r(p99) if hourly_adj_wage > r(p99) & !mi(hourly_adj_wage)
  replace hourly_adj_wage = r(p1) if hourly_adj_wage < r(p1) & !mi(hourly_adj_wage) & hourly_adj_wage != 0
  sum hourly_adj_wage, d

* Positive wages: winsorized wage where > 0, plus a 0/1 flag for wage > 0
  gen poswage_adj = hourly_adj_wage if hourly_adj_wage > 0
  gen poswage_dummy = hourly_adj_wage > 0 if !mi(hourly_adj_wage)

* Replace wages as missing if no hours are worked
  gen hourly_wage_no0 = hourly_wage
  replace hourly_wage_no0 = . if hours_worked == 0

* Positive wages
  gen poswage_no0 = hourly_wage_no0 if hourly_wage_no0 > 0

* Cap at the 99th percentile of strictly positive wages
* NOTE(review): the original header here read "Trim 1% and 99% percentiles",
* but unlike hourly_adj_wage above only the p99 cap is applied -- there is no
* p1 floor. Confirm whether the omission is intentional.
  gen hourly_adj_wage_no0 = hourly_wage_no0
  sum hourly_adj_wage_no0 if hourly_adj_wage_no0 > 0, d
  replace hourly_adj_wage_no0 = r(p99) if hourly_adj_wage_no0 > r(p99) & !mi(hourly_adj_wage_no0)
  sum hourly_adj_wage_no0, d
  gen poswage_adj_no0 = hourly_adj_wage_no0 if hourly_adj_wage_no0 > 0

  // Exchange rate: 14,403.60 rp/$
  // median monthly salary (assuming 8 hours/day, 5 days/week, 4 weeks/month)
  // Relies on r(p50) still holding the median from the
  // "sum hourly_adj_wage_no0, d" two commands above.
  di `r(p50)' * 4 * 5 * 8
  di `r(p50)' * 4 * 5 * 8 / 14400

* Job tenure, expressed in months
  tabulate b5_r22a1, missing
  tabulate b5_r22a2i, missing
  tabulate b5_r22a2ii, missing

  * Helper flags: which of the two tenure questions carries a non-zero answer
  * (b5_r22a1 = months for tenures up to a year, b5_r22a2i = years beyond that)
  generate tenure_year_or_less = (b5_r22a1 != 0)
  generate tenure_year_more    = (b5_r22a2i != 0)
  tabulate tenure_year_or_less tenure_year_more

  * Month component for the longer tenures: a value of 99 (assumed to mean
  * "don't know") is replaced by 6, the middle of the year
  generate b5_r22a2ii_no99 = cond(b5_r22a2ii == 99, 6, b5_r22a2ii)

  * Assemble tenure_months
  // step 1: respondents answering in months only
  generate tenure_months = b5_r22a1 if tenure_year_or_less == 1
  summarize tenure_months

  // step 2: respondents answering in years + months (overrides step 1)
  replace tenure_months = (12 * b5_r22a2i) + b5_r22a2ii_no99 if tenure_year_more == 1
  summarize tenure_months

  // step 3: whatever is still missing is set to zero months
  count if tenure_months == .
  replace tenure_months = 0 if tenure_months == .
  drop tenure_year_or_less tenure_year_more b5_r22a2ii_no99
  summarize tenure_months, detail

  * Sanity check: tenure must be shorter than the time since the respondent
  * was (approximately) 4 years old, i.e. age in months minus 48
  generate age_months = 12 * age
  replace age_months = age_months - 48
  summarize age_months
  assert tenure_months < age_months if tenure_months != .

* Uses the internet at work (b5_r25b == 1)
  tabulate b5_r25b, missing
  generate use_internet_work = (b5_r25b == 1)
  // blank it out where b5_r25a1 is 0, then give the non-employed an explicit 0
  replace use_internet_work = . if b5_r25a1 == 0
  replace use_internet_work = 0 if employed == 0 & missing(use_internet_work)
  tabulate use_internet_work

* Holds more than one job (either b5_r37a or b5_r37b equals 1)
  tabulate b5_r37a, missing
  tabulate b5_r37b, missing
  generate multiple_jobs = (b5_r37a == 1) | (b5_r37b == 1)
  tabulate multiple_jobs, missing

* Left a job during the past year (b5_r48 == 1)
  tabulate b5_r48, missing
  generate leave_job = (b5_r48 == 1)

* Job search / business creation: currently looking for a job (b5_r12a == 1)
  tabulate b5_r12a, missing
  generate looking_job = (b5_r12a == 1)

* Did activities to search for a job (24) --individual variables but also create a variable that equals 1 if did any of the following
* FIX: the b5_r16* items code "yes" as either 1 or 3 (see the coding note and
* the individual dummies in the employment section, e.g. b5_r16b == 3). The
* previous version only looked for the value 1, so a "yes" on the items coded
* with 3 was never counted. The flag now checks both yes codes on every item.
* NOTE: this changes values of `activities_job`, so the hard-coded
* datasignature strings checked before each save in this file must be regenerated.
  gen activities_job = inlist(1, b5_r16a, b5_r16b, b5_r16c, b5_r16d, b5_r16e, b5_r16f, b5_r16g, b5_r16h, b5_r16i) ///
    | inlist(3, b5_r16a, b5_r16b, b5_r16c, b5_r16d, b5_r16e, b5_r16f, b5_r16g, b5_r16h, b5_r16i)
  label var activities_job "Any effort to find a job"

  * Preparing to start a new business (b5_r12b == 1)
  tabulate b5_r12b, missing
  generate prep_new_bus = (b5_r12b == 1)
  *bys hh_id_s: egen prep_bus_hh = max(prep_new_bus)

* Labor force participation: employed, looking for a job, or preparing a business
  tabulate employed, missing
  generate labor_force = (employed == 1) | (looking_job == 1) | (prep_new_bus == 1)
  tabulate labor_force, missing

******************************************************
* Count, for every record, how many other records share its anon_id4
  duplicates tag anon_id4, generate(anon_id4_dupe)
  tabulate anon_id4_dupe

* Remove every record of a repeated anon_id4; records with a missing anon_id4
* are kept even though they tag one another as duplicates
  drop if anon_id4_dupe > 0 & !missing(anon_id4)
  drop anon_id4_dupe

/*----------------------------------------------------*/
              * Export cleaned data
/*----------------------------------------------------*/

  * Final tidy-up before saving: rename the weight, drop the merge indicators
  rename weightr_sp weight
  drop merge1 merge2

  * Reproducibility guard: only save when the data signature matches the
  * expected one; otherwise halt the do-file.
  * FIX: the failure branch used to call `stop`, which is not a Stata command.
  * It only halted through an "unrecognized command" error and would have kept
  * running if a user-written `stop` were ever on the adopath. It now exits
  * explicitly with a non-zero return code.
  datasignature
  if "`r(datasignature)'" == "782729:272(102243):1066763926:3334665320" {
    save "$KP_deid_sakernas/Clean/sak_aug19_deid_clean.dta", replace
  }
  else {
    di as err "Careful, your machine produces a different dataset"
    exit 459
  }
  

/*----------------------------------------------------*/
          * Merge with admin data and export
/*----------------------------------------------------*/

  * Keep only respondents that carry at least one linking id
  keep if !mi(userid_bps) | !mi(anon_id4)

  * Drop the raw questionnaire variables; only the cleaned ones are merged
  drop b1* b2* b4* b5*

  * Survey-round marker for this file
  gen sak_round = 3

  drop date_incentive

* merge
  * Store the respondents with a non-missing anon_id4 as the lookup side.
  * (A preserve/restore pair used to wrap these lines; it was redundant because
  * the data in memory are replaced by the admin file immediately afterwards.)
  keep if !missing(anon_id4)
  tempfile sak_anon_id
  save `sak_anon_id'

  * Load the admin data and keep only rows whose anon_id4 is found in SAKERNAS
  use "$KP_deid_admin/Clean/pmo_b1-22_clean_long_deid.dta", clear

  merge m:1 anon_id4 using `sak_anon_id', nogen keep(3)

  compress

  * Reproducibility guard: only save when the data signature matches the
  * expected one; otherwise halt the do-file.
  * FIX: the failure branch used to call `stop`, which is not a Stata command
  * and only halted through an "unrecognized command" error; it now exits
  * explicitly with a non-zero return code.
  * NOTE(review): the log opened at the top of this file is never closed here;
  * presumably the next script's `cap log close` does it -- confirm.
  datasignature
  if "`r(datasignature)'" == "179365:165(78234):795481401:3843912091" {
    save "$KP_deid_sakernas/Clean/sak_aug19_deid_clean_merged.dta", replace
  }
  else {
    di as err "Careful, your machine produces a different dataset"
    exit 459
  }
	

  // Done
